library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Read cps.csv, preview the first rows, then report row/column counts
## and the structure of the data frame.
df <- read.csv(file = "cps.csv")
head(df)
cat("Number of instances : ", dim(df)[1L])
## Number of instances :  534
cat("\nNumber of attributes : ", dim(df)[2L])
## 
## Number of attributes :  11
str(df)
## 'data.frame':    534 obs. of  11 variables:
##  $ wage    : num  9 5.5 3.8 10.5 15 9 9.57 15 11 5 ...
##  $ educ    : int  10 12 12 12 12 16 12 14 8 12 ...
##  $ race    : chr  "W" "W" "W" "W" ...
##  $ sex     : chr  "M" "M" "F" "F" ...
##  $ hispanic: chr  "NH" "NH" "NH" "NH" ...
##  $ south   : chr  "NS" "NS" "NS" "NS" ...
##  $ married : chr  "Married" "Married" "Single" "Married" ...
##  $ exper   : int  27 20 4 29 40 27 5 22 42 14 ...
##  $ union   : chr  "Not" "Not" "Not" "Not" ...
##  $ age     : int  43 38 22 47 58 49 23 42 56 32 ...
##  $ sector  : chr  "const" "sales" "sales" "clerical" ...

1) From the given cps dataset,

a) Find the distribution of wage

## Kernel density plot of the wage column.
## The title now names the variable actually plotted (wage); the previous
## title was misspelled and referred to a different dataset.
density_plot <- ggplot(df, aes(x = wage)) +
  geom_density(fill = "indianred3") +
  labs(x = "wage", y = "density", title = "Kernel density of wage")

density_plot

b) Find the distribution of wage with respect to race

## Box plot: one box per race summarising the spread of wage.
## A bar trace fed the raw rows draws one bar per observation at the same
## x category, which does not show a distribution; a box trace does.
## The variable keeps the name bar_plot so any later reference still resolves.
bar_plot <- plot_ly(data = df, x = ~race, y = ~wage, type = "box")
bar_plot

c) Is there a correlation between age and wage?

## Pearson correlation coefficient between wage and age, printed with cat().
x <- df[["wage"]] ## numeric
y <- df[["age"]]  ## integer
cat(cor(x, y, method = "pearson"))
## 0.1769669

d) Does the wage differ with marital status?

## Grouped box plot of wage by marital status, with one box per sex inside
## each marital-status category.
## Fixes relative to the earlier version:
##   * title/axis labels describe marital status (x) and wage (y), not age;
##   * axis titles are set through `title` (axes have no `text` attribute);
##   * the legend title is given as list(text = ...);
##   * a box trace replaces the raw-row bar trace so the wage spread is
##     visible, and boxmode = "group" places the sexes side by side.
## The variable keeps the name bar_plot so any later reference still resolves.
bar_plot <- plot_ly(
  data = df, x = ~married, y = ~wage, color = ~sex, type = "box"
)
bar_plot %>% layout(
  boxmode = "group",
  title = list(text = "Wage by Marital Status"),
  legend = list(title = list(text = "Gender")),
  xaxis = list(title = "Marital Status"),
  yaxis = list(title = "Wage")
)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

2) Using Iris Data-Set

a) Plot the relation between sepal length and sepal width.

Differentiate the different species and also show the variation in the sepal length in the graph

## Scatter plot of sepal width against sepal length: point colour encodes
## the sepal length value and point shape encodes the species.
df2 <- iris
plot <- ggplot(
  data = df2,
  mapping = aes(
    x = Sepal.Length,
    y = Sepal.Width,
    color = Sepal.Length,
    shape = Species
  )
) +
  geom_point() +
  ggtitle("Sepal Length and Sepal Width") +
  xlab("Sepal Length") +
  ylab("Sepal Width")
plot

b) Use subplots and plot the relationship between the different species and the other attributes

## Pairwise plot matrix over columns 1 to 4 of df2, coloured by species;
## the upper triangle of the matrix is left blank.
ggpairs(
  df2,
  mapping = aes(color = Species),
  columns = 1:4,
  upper = "blank"
) +
  ggtitle("IRIS")

4) Using the iris data-set

a) Create a new column for Sepal Length to Petal Length ratio

df4 <- iris

## mutate() returns a new data frame rather than modifying df4 in place,
## so the result is assigned back; otherwise head(df4) would not show the
## ratio column.
df4 <- df4 %>% mutate(ratio = Sepal.Length / Petal.Length)
head(df4)

b) Select all the columns except Species column

## Print df4 with the Species column dropped (df4 itself is not modified).
select(df4, -Species)

c) Select only the rows where Sepal width is greater than 3.5

## Print the rows of df4 whose Sepal.Width exceeds 3.5 (df4 is not modified).
filter(df4, Sepal.Width > 3.5)